{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filters on the original Amazon reviews dataset:\n",
    "1. coreset (reviewers who reviewed at least five things and products with at least five reviews)\n",
    "2. helpful (reviews with more helpful upvotes than unhelpful upvotes - requires at least one upvote)\n",
    "3. sentiment non-ambiguity (the rating has to be 1, 3, or 5. A 1, 3, or 5 is clearly negative, middle, or positive, but there is no way to verify that a 2 is really a 2, or what a 4 really is, so all 2s and 4s are dropped)\n",
    "4. non-empty\n",
    "\n",
    "This results in ~ 10 million reviews."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# These imports enhance Python2/3 compatibility.\n",
    "from __future__ import print_function, absolute_import, division, unicode_literals, with_statement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from cleanlab.models.fasttext import FastTextClassifier, data_loader\n",
    "import cleanlab\n",
    "import numpy as np\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "import copy\n",
    "from datetime import datetime as dt\n",
    "import pickle\n",
    "import scipy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths are built by plain string concatenation (directory + filename),\n",
    "# so both directories must end with a trailing slash.\n",
    "\n",
    "# Input: the raw reviews file, read as one JSON object per line.\n",
    "data_dir, data_fn = \"/media/ssd/datasets/datasets/amazon5core/\", \"amazon5core.json\"\n",
    "\n",
    "# Output: where the fasttext-formatted reviews are written.\n",
    "write_dir, write_fn = \"/home/curtis/\", 'amazon5core.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_already_trained = False\n",
    "need_to_prepare_data_for_fasttext = False\n",
    "if need_to_prepare_data_for_fasttext:\n",
    "    # Convert the amazon dataset to fasttext format, keeping only reviews that\n",
    "    #   - have more helpful votes than unhelpful votes,\n",
    "    #   - are rated 1, 3, or 5, and\n",
    "    #   - have non-empty text.\n",
    "    # This takes about 6 minutes.\n",
    "    # NOTE: this branch only writes write_fn and collects `labels`; it does not\n",
    "    # define `text`. The else-branch below reads 'amazon5core.preprocessed.txt',\n",
    "    # which has to be produced from the file written here first.\n",
    "    label_map = {1:0, 3:1, 5:2}\n",
    "    labels = []\n",
    "    with open(data_dir + data_fn, 'r') as rf:\n",
    "        with open(write_dir + write_fn, 'w') as wf:\n",
    "            for line in rf:\n",
    "                d = json.loads(line)\n",
    "                # presumably d['helpful'] is [helpful votes, total votes] -- TODO confirm.\n",
    "                h = d['helpful']\n",
    "                if h[0] <= h[1] // 2:\n",
    "                    continue\n",
    "                label = int(d['overall'])\n",
    "                if label not in label_map:\n",
    "                    continue\n",
    "                # Strip before the emptiness check so that whitespace-only\n",
    "                # reviews are dropped instead of written as empty examples.\n",
    "                review = d['reviewText'].strip()\n",
    "                if not review:\n",
    "                    continue\n",
    "                wf.write(\"__label__{} {}\\n\".format(\n",
    "                    label,\n",
    "                    review.replace('\\n', ' __newline__ '),\n",
    "                ))\n",
    "                labels.append(label_map[label])\n",
    "    # Later cells index labels with arrays of fold indices, which needs an ndarray.\n",
    "    labels = np.array(labels, dtype=int)\n",
    "else:\n",
    "    # Load the labels (and the text, unless the TF-IDF matrix is already cached)\n",
    "    # from the preprocessed fasttext file, one batch at a time.\n",
    "    loc = write_dir + 'amazon5core.preprocessed.txt'\n",
    "    bs = 1000000\n",
    "    label_map = {'__label__1':0, '__label__3':1, '__label__5':2}\n",
    "    # Collect per-batch arrays instead of preallocating a hard-coded number of\n",
    "    # rows, so the result always matches the number of examples in the file.\n",
    "    label_batches = []\n",
    "    text = []\n",
    "    for l, t in data_loader(loc, batch_size=bs):\n",
    "        label_batches.append(np.array([label_map[lab] for lab in l], dtype=int))\n",
    "        if not tfidf_already_trained:\n",
    "            text.extend(t)\n",
    "    if label_batches:\n",
    "        labels = np.concatenate(label_batches)\n",
    "    else:\n",
    "        labels = np.empty(0, dtype=int)"
   ]
  },
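  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the data by running the shell command below in the directory that\n",
    "# contains amazon5core.txt. It puts spaces around the punctuation characters\n",
    "# listed in the sed pattern, lowercases the text, and writes the result to\n",
    "# amazon5core.preprocessed.txt.\n",
    "# cat amazon5core.txt | sed -e \"s/\\([.\\!?,'/()]\\)/ \\1 /g\" | tr \"[:upper:]\" \"[:lower:]\" > amazon5core.preprocessed.txt"
   ]
  },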
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# TF-IDF vectorizer on the entire corpus. The resulting sparse matrix X is\n",
    "# cached on disk so that the slow fit only has to run once.\n",
    "\n",
    "# Single definition of the cache location, built from the configured write_dir.\n",
    "vectorized_fn = write_dir + 'amazon_text_vectorized.npz'\n",
    "\n",
    "if tfidf_already_trained:\n",
    "    with open(vectorized_fn, 'rb') as rf:\n",
    "        X = scipy.sparse.load_npz(rf)\n",
    "else:\n",
    "    # Takes about 20 minutes\n",
    "    tfidf = TfidfVectorizer(\n",
    "        stop_words='english',\n",
    "        ngram_range=(1,1), # (1, 2) takes too much mem\n",
    "#         max_features=2000000,\n",
    "    )\n",
    "    X = tfidf.fit_transform(text)\n",
    "    with open(vectorized_fn, 'wb') as wf:\n",
    "        scipy.sparse.save_npz(wf, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# text_already_vectorized = False\n",
    "# if text_already_vectorized:\n",
    "#     with open('/home/curtis/amazon_text_vectorized.npy', 'rb') as rf:\n",
    "#         text = np.load(rf)\n",
    "# else:\n",
    "#     # Takes a while, maybe 25 minutes.\n",
    "#     text = tfidf.transform(text)\n",
    "#     with open('/home/curtis/amazon_text_vectorized.npy', 'wb') as wf:\n",
    "#         np.save(wf, text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 6664290 3332147\n",
      "X_train 0:00:10\n",
      "X_holdout 0:00:16\n",
      "y_train 0:00:16\n",
      "y_holdout 0:00:16\n",
      "clf_fit 0:02:42\n",
      "pyx[cv_holdout_idx] 0:02:44\n",
      "1 6664291 3332146\n",
      "X_train 0:00:10\n",
      "X_holdout 0:00:15\n",
      "y_train 0:00:15\n",
      "y_holdout 0:00:15\n",
      "clf_fit 0:02:37\n",
      "pyx[cv_holdout_idx] 0:02:39\n",
      "2 6664293 3332144\n",
      "X_train 0:00:09\n",
      "X_holdout 0:00:15\n",
      "y_train 0:00:15\n",
      "y_holdout 0:00:15\n",
      "clf_fit 0:02:38\n",
      "pyx[cv_holdout_idx] 0:02:40\n"
     ]
    }
   ],
   "source": [
    "# Compute out-of-sample predicted probabilities using cross-validation\n",
    "seed = 0\n",
    "cv_n_folds = 3\n",
    "# np.asarray leaves an ndarray untouched and converts a plain Python list,\n",
    "# which could not be indexed with the arrays of fold indices used below.\n",
    "labels = np.asarray(labels)\n",
    "n = len(labels)\n",
    "m = 3  # Number of columns of pyx, one per class.\n",
    "\n",
    "\n",
    "def print_elapsed(name, start):\n",
    "    \"\"\"Print `name` and the time elapsed since `start`, dropping microseconds.\"\"\"\n",
    "    # Splitting on '.' also works when the microsecond part happens to be absent.\n",
    "    print(name, str(dt.now() - start).split('.')[0])\n",
    "\n",
    "\n",
    "# Create cross-validation object for out-of-sample predicted probabilities.\n",
    "# Stratified folds preserve the fraction of examples of each (noisy) class\n",
    "# in every fold.\n",
    "kf = StratifiedKFold(n_splits = cv_n_folds, shuffle = True, random_state = seed)\n",
    "\n",
    "# Initialize the output array of predicted probabilities, one row per example.\n",
    "pyx = np.empty((n, m))\n",
    "\n",
    "# Split data into \"cv_n_folds\" stratified folds.\n",
    "for k, (cv_train_idx, cv_holdout_idx) in enumerate(kf.split(range(n), labels)):\n",
    "    print(k, len(cv_train_idx), len(cv_holdout_idx))\n",
    "    start = dt.now()\n",
    "    # Use the shared seed so the folds and the classifier are seeded consistently.\n",
    "    clf = SGDClassifier(\n",
    "        alpha=0.000001, loss='modified_huber',\n",
    "        max_iter=50, n_jobs=12,\n",
    "        penalty='l2', random_state=seed, tol=0.0001,\n",
    "    )\n",
    "    X_train = X[cv_train_idx]\n",
    "    print_elapsed(\"X_train\", start)\n",
    "    X_holdout = X[cv_holdout_idx]\n",
    "    print_elapsed(\"X_holdout\", start)\n",
    "    y_train = labels[cv_train_idx]\n",
    "    print_elapsed(\"y_train\", start)\n",
    "    y_holdout = labels[cv_holdout_idx]\n",
    "    print_elapsed(\"y_holdout\", start)\n",
    "    clf.fit(X_train, y_train)\n",
    "    print_elapsed(\"clf_fit\", start)\n",
    "    # Fill in the rows of the examples that were held out of this fold's training set.\n",
    "    pyx[cv_holdout_idx] = clf.predict_proba(X_holdout)\n",
    "    print_elapsed(\"pyx[cv_holdout_idx]\", start)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stored results directory\n",
    "pyx_dir = '/media/ssd/datasets/pyx/amazon/'\n",
    "\n",
    "# Save the cross-validated predicted probabilities; the filename records the\n",
    "# number of folds.\n",
    "# NOTE(review): the file is written to write_dir and pyx_dir is not used by\n",
    "# this cell -- confirm which directory is intended.\n",
    "pyx_fn = 'amazon_pyx_tfidf_cv_{}fold.npy'.format(cv_n_folds)\n",
    "with open(write_dir + pyx_fn, 'wb') as out_file:\n",
    "    np.save(out_file, pyx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the predicted probabilities stored for this number of CV folds.\n",
    "pyx_path = write_dir + 'amazon_pyx_tfidf_cv_{}fold.npy'.format(cv_n_folds)\n",
    "with open(pyx_path, 'rb') as in_file:\n",
    "    pyx = np.load(in_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8703535069545278"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check of the probabilities: accuracy of the most probable class\n",
    "# of each row of pyx against the given labels.\n",
    "predicted_labels = pyx.argmax(axis=1)\n",
    "accuracy_score(labels, predicted_labels)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
